using <- function(...) {
libs <- unlist(list(...))
need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
if(length(need) > 0){
install.packages(need, repos = "https://cloud.r-project.org")
need <- need[!unlist(lapply(need, require, character.only = T))]
if (length(need) > 0) {
if (!requireNamespace("BiocManager", quietly = T))
install.packages("BiocManager")
BiocManager::install(need)
}
}
lapply(libs, require, character.only = T)
}
using("data.table", "tidyverse", "smacof", "DESeq2", "InteractionSet", "diffHic",
"broom", "highcharter", "heatmaply", "plotly", "eulerr")We’ll have to to read in some data, though it’s already been done and the output saved in lec12.RData.
load('lec12.RData')This “Import” section doesn’t need to be ran in class, they’re just for your reference
While we have one correlation coefficient per chromosome, we can take the average (weighted by chromosome length) to obtain a representative value per pairwise comparison
# relative size of each chromosome
f <- fread('mm10.chrom.sizes')$V2 %>% {. / sum(.)}
# we take the weighted average of SCC across chromosomes
hr <- list.files('hicrep', full.names = T) %>%
setNames(., sub('.txt', '', basename(.))) %>%
sapply(function(x) sum(fread(x, skip = 2)$V1 * f)) %>%
data.frame(c = .) %>%
rownames_to_column('p') %>%
separate(p, c('s1', 's2'), '\\.') %>%
pivot_wider(names_from = s2, values_from = 'c') %>%
column_to_rownames('s1')| ESC_1 | ESC_2 | ESC_3 | ESC_4 | ESC | NPC_1 | NPC_2 | NPC_3 | NPC_4 | NPC | |
|---|---|---|---|---|---|---|---|---|---|---|
| ESC_1 | 1 | 0.983 | 0.953 | 0.966 | 0.985 | 0.801 | 0.804 | 0.826 | 0.825 | 0.821 |
| ESC_2 | 0.983 | 1 | 0.985 | 0.988 | 0.997 | 0.856 | 0.858 | 0.879 | 0.878 | 0.875 |
| ESC_3 | 0.953 | 0.985 | 1 | 0.99 | 0.99 | 0.891 | 0.898 | 0.91 | 0.912 | 0.91 |
| ESC_4 | 0.966 | 0.988 | 0.99 | 1 | 0.992 | 0.884 | 0.889 | 0.899 | 0.901 | 0.9 |
| ESC | 0.985 | 0.997 | 0.99 | 0.992 | 1 | 0.861 | 0.866 | 0.883 | 0.884 | 0.88 |
| NPC_1 | 0.801 | 0.856 | 0.891 | 0.884 | 0.861 | 1 | 0.978 | 0.973 | 0.971 | 0.983 |
| NPC_2 | 0.804 | 0.858 | 0.898 | 0.889 | 0.866 | 0.978 | 1 | 0.99 | 0.99 | 0.996 |
| NPC_3 | 0.826 | 0.879 | 0.91 | 0.899 | 0.883 | 0.973 | 0.99 | 1 | 0.996 | 0.997 |
| NPC_4 | 0.825 | 0.878 | 0.912 | 0.901 | 0.884 | 0.971 | 0.99 | 0.996 | 1 | 0.997 |
| NPC | 0.821 | 0.875 | 0.91 | 0.9 | 0.88 | 0.983 | 0.996 | 0.997 | 0.997 | 1 |
Instead of the complete output, we just take the logbin summarized files for visualization
ps <- list.files('exp/1000/cis', full.names = T) %>%
grep('der|log', ., value = T) %>%
split(., sub('.*\\.(.*)\\.tsv', '\\1', .)) %>%
lapply(function(x) {
split(x, sub('\\..*', '', basename(x))) %>%
lapply(fread)
})
head(ps$log$ESC)ins <- list.files('ins/10000/100000', pattern = 'tsv', full.names = T) %>%
grep('ESC|NPC', ., value = T) %>%
setNames(., sub('.tsv', '', basename(.))) %>%
lapply(fread)
head(ins$ESC)The outputs from cooltools call-dots are directly read in
dots <- list.files('dots', pattern = 'postproc', full.names = T) %>%
setNames(., sub('\\..*', '', basename(.))) %>%
lapply(fread)Omitted since the process of constructing a DESeqDataSet from Salmon outputs has been described before
There’s a number of ways through which we can examine the correlation matrix
hr %>%
{.[upper.tri(., diag = T)] <- NA; .} %>%
rownames_to_column('s1') %>%
pivot_longer(-s1, names_to = 's2', values_to = 'c') %>%
na.omit() %>%
mutate(c1 = sub('_.*', '', s1),
c2 = sub('_.*', '', s2),
kind = ifelse(c1 == c2, 'same', 'diff')) %>%
plot_ly(x = ~kind, y = ~c, color = ~kind, type = 'box')tibble(s = names(hr)) %>%
mutate(type = factor(sub('_.*', '', s))) %>%
column_to_rownames('s') %>%
heatmaply(hr, row_side_colors = ., col_side_colors = .,
label_names = c("Sample 1", "Sample 2", "SCC"))sim2diss(hr, method = 'corr') %>%
mds(ndim = 2) %>%
.$conf %>%
as.data.frame() %>%
rownames_to_column('s') %>%
mutate(type = sub('_.*', '', s)) %>%
plot_ly(x = ~D1, y = ~D2, color = ~type, text = ~s) %>%
add_markers(legendgroup = ~type) %>%
add_text(textposition = 'top left', showlegend = F, legendgroup = ~type)For the contact probability decay curves and their derivatives that we’ve computed before, we can again just simply visualize them
# colors for each cell type
hclrs <- c(ESC = "#7cb5ec", NPC = "#434348")
# plot data
pd <- ps %>%
lapply(function(x) {
d <- rbindlist(x, idcol = 'samp')
ifelse('slope' %in% names(d), 'slope', 'balanced.avg') %>%
c('samp', 's_bp', .) %>%
d[, ., with = F] %>%
`colnames<-`(c('samp', 'x', 'y'))
}) %>%
rbindlist(idcol = 'kind') %>%
group_by(kind, samp) %>%
do(data = list_parse2(data.frame(.$x, .$y))) %>%
ungroup() %>%
separate(samp, c('ctype', 'rep'), '_', F, fill = 'right') %>%
#dplyr::filter(is.na(rep)) %>%
mutate(color = hclrs[ctype],
ctype = factor(ctype, names(hclrs))) %>%
arrange(ctype) %>%
mutate(name = samp,
id = ifelse(kind != 'log', tolower(name), NA),
linkedTo = ifelse(kind == 'log', tolower(name), NA),
yAxis = ifelse(kind == 'log', 0, 2))
highchart() %>%
# we use log scale for P(s) and linear for the slope (which was already taken in log space)
hc_yAxis_multiples(list(title = list(text = "<b>Contact probability</b>, P<sub>c</sub>(s)",
useHTML = TRUE),
type = "logarithmic",
labels = list(formatter = JS("function(){return this.value.toExponential(0);}")),
height = '45%', top = '0%', offset = 0),
list(height = '10%', top = '45%',
title = NULL,
plotLines = list(
list(color = "#a9a9a9", width = 2,
value = .5, zIndex = 1)
),
labels = list(enabled = F),
gridLineWidth = 0,
min = 0, max = 1),
list(type = "linear",
title = list(
text = "<b>Slope</b>, <sup>d</sup>⁄<sub>ds</sub> log P<sub>c</sub>(s)",
useHTML = TRUE
),
height = '45%', top = '55%', offset = 0)) %>%
# grab the data
hc_add_series_list(pd) %>%
# a bit of formatting
hc_xAxis(type = "logarithmic",
title = list(text = "<b>Genomic separation</b> (bp), s",
useHTML = T),
minorTickInterval = 'auto',
min = 1e4, max = 1e8) %>%
hc_tooltip(headerFormat = '<span style="font-size: 10px">{point.key:,.0f} bp</span><br/>') %>%
hc_chart(zoomType = "xy") %>%
hc_plotOptions(line = list(marker = list(enabled = F)))Next we’ll check out the first eigenvector tracks (i.e., “compartment score”)
d <- lapply(dat, `[[`, 'E1') %>% bind_cols() %>% na.omit()
x <- d$ESC
y <- d$NPC
subplot(
plot_ly(x = x, color = I(hclrs['ESC']), type = 'histogram'),
plotly_empty(),
plot_ly(x = x, y = y, type = 'histogram2dcontour', showscale = F,
contours = list(showlines = F), ncontours = 30),
plot_ly(y = y, color = I(hclrs['NPC']), type = 'histogram'),
nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2),
shareX = T, shareY = T, titleX = F, titleY = F
) %>% layout(showlegend = FALSE, xaxis = list(title = 'ESC'), yaxis2 = list(title = 'NPC'))# edges
lnks <- d %>%
mutate(across(everything(), function(x) ifelse(x > 0, 'A', 'B'))) %>%
`colnames<-`(c('from', 'to')) %>%
dplyr::count(from, to, name = 'weight') %>%
mutate(from = sprintf('%s.%s', 'ESC', from),
to = sprintf('%s.%s', 'NPC', to),
weight = 100 * weight / sum(weight)) %>%
transpose()
# order of cell types
odr <- names(hclrs)
# nodes
nds <- lapply(lnks, function(x) c(x$from, x$to)) %>%
unlist() %>%
unique() %>%
lapply(function(x) {
tibble(id = x) %>%
separate(id, c('samp', 'name'), '\\.', F) %>%
mutate(column = which(odr == samp) - 1,
color = ifelse(name == 'A', '#6AA56E', '#B65655')) %>%
as.list()
})
# show as sankey
highchart() %>%
hc_chart(type = 'sankey') %>%
hc_add_series(data = lnks, nodes = nds) %>%
hc_xAxis(tickPositions = c(-0.49, 3.5),
startOnTicks = F,
endOnTicks = F,
lineColor = 'transparent',
labels = list(formatter = JS("function() {
var d;
if (this.value < 0) {
d = 'ES';
} else if (this.value > 3) {
d = 'NPC';
}
return d;
}"), align = 'center'),
tickLength = 0) %>%
hc_chart(showAxes = T) %>%
hc_tooltip(nodeFormatter = JS("function() {
return '<span style=\"font-size: 10px\">' + this.options.samp +
'</span><br/>' + this.options.name + ': ' +
Highcharts.numberFormat(this.sum, 1) + '%';
}"),
pointFormatter = JS("function() {
return '<span style=\"font-size: 10px\">' +
this.fromNode.options.samp + '→' +
this.toNode.options.samp + '</span><br/>' +
this.fromNode.options.name + '→' +
this.toNode.options.name + ': ' +
Highcharts.numberFormat(this.weight, 1) + '%';
}"),
headerFormat = '',
useHTML = T)dat$ESC %>%
bind_cols() %>%
na.omit() %>%
mutate(across(-E1, log10)) %>%
mutate(across(-E1, function(x) x - Input)) %>%
dplyr::filter(is.finite(rowSums(.))) %>%
dplyr::select(-Input) %>%
cor() %>%
heatmaply_cor()For the simplest comparison we can just count the number of shared boundaries
bdrs <- ins[c('ESC', 'NPC')] %>%
lapply(function(x) {
na.omit(x) %>%
dplyr::filter(boundary_strength_100000 > .5) %>%
mutate(start = start + 1) %>%
makeGRangesFromDataFrame()
})
Reduce(c, bdrs) %>%
unique() %>%
{lapply(bdrs, function(x) overlapsAny(., x))} %>%
bind_cols() %>%
euler() %>%
plot(quantities = T)There are specific classes in R that handles paired ranges, one of which is GInteractions
lps <- dots[c('ESC', 'NPC')] %>%
lapply(function(x) {
list(x[,1:3], x[,4:6]) %>%
lapply(function(y) {
y %>%
`colnames<-`(c('chr', 'start', 'end')) %>%
mutate(start = start + 1) %>%
makeGRangesFromDataFrame()
}) %>%
{GInteractions(.[[1]], .[[2]], mode = 'reverse')}
})
lps$ESC## ReverseStrictGInteractions object with 3738 interactions and 0 metadata columns:
## seqnames1 ranges1 seqnames2 ranges2
## <Rle> <IRanges> <Rle> <IRanges>
## [1] chr1 4750001-4775000 --- chr1 4500001-4525000
## [2] chr1 5000001-5025000 --- chr1 4475001-4500000
## [3] chr1 5900001-5925000 --- chr1 4475001-4500000
## [4] chr1 5900001-5925000 --- chr1 4750001-4775000
## [5] chr1 5900001-5925000 --- chr1 4900001-4925000
## ... ... ... ... ... ...
## [3734] chrX 167800001-167825000 --- chrX 167300001-167325000
## [3735] chrX 167800001-167825000 --- chrX 167475001-167500000
## [3736] chrX 168925001-168950000 --- chrX 168400001-168425000
## [3737] chrX 168925001-168950000 --- chrX 168700001-168725000
## [3738] chrX 169275001-169300000 --- chrX 168925001-168950000
## -------
## regions: 5537 ranges and 0 metadata columns
## seqinfo: 20 sequences from an unspecified genome; no seqlengths
Reduce(c, lps) %>%
unique() %>%
InteractionSet(matrix(1, length(.)), .) %>%
clusterPairs(tol = 1, upper = 1e6) %>%
.$interactions %>%
{lapply(lps, function(x) overlapsAny(., x))} %>%
bind_cols() %>%
euler() %>%
plot(quantities = T)As before, the results of differential interaction analysis can be visualized using the same types of figures we’ve used previously
res <- di %>%
lapply(function(x) {
results(x, contrast = c('condition', 'NPC', 'ESC')) %>%
as.data.frame() %>%
rownames_to_column('crd')
}) %>%
bind_rows(.id = 'chr') %>%
separate(crd, c('i', 'j'), ':') %>%
mutate(info = sprintf('%s:%s-%s<br>baseMean: %.3g<br>log2FoldChange: %.3g<br>FDR: %.3g',
chr, i, j, baseMean, log2FoldChange, padj))
plot_ly(res, x = ~baseMean, y = ~log2FoldChange, color = ~-log10(padj),
type = 'scatter', mode = 'markers', customdata = ~info,
hovertemplate = '%{customdata}<extra></extra>') %>%
layout(xaxis = list(type = 'log'))One simple integrative analysis is to assess the impact of differential long-range interaction in promoter regions on expression of the cognate gene.
# differential expressed genes
deg <- results(dds, contrast = c('condition', 'NPC', 'ESC')) %>%
as.data.frame() %>%
rownames_to_column('ID') %>%
na.omit() %>%
dplyr::filter(padj < .01 & abs(log2FoldChange) > 2) %>%
mutate(dir = ifelse(log2FoldChange > 0, 'up', 'dn')) %>%
dplyr::filter(ID %in% genes$ID[genes$gene_type == 'protein_coding']) %>%
{split(.$ID, .$dir)}
# genes with differential promoter long-range interaction
dig <- res %>%
na.omit() %>%
dplyr::filter(padj < .01 & abs(log2FoldChange) > .5) %>%
mutate(dir = ifelse(log2FoldChange > 0, 'up', 'dn')) %>%
split(., .$dir) %>%
lapply(function(x) {
list(x[,c(1,2)], x[,c(1,3)]) %>%
lapply(function(y) {
y[[3]] <- as.numeric(y[[2]]) + 2.5e4
colnames(y) <- c('chr', 'start', 'end')
y
}) %>%
bind_rows() %>%
mutate(start = as.numeric(start) + 1) %>%
makeGRangesFromDataFrame() %>%
{genes[overlapsAny(promoters(genes), .)]} %>%
{.[.$gene_type == 'protein_coding']} %>%
.$ID %>%
unique()
}) %>%
{lapply(., setdiff, intersect(.[[1]], .[[2]]))}
# total number of quantifiable genes
tot <- results(dds) %>%
na.omit() %>%
rownames() %>%
intersect(., genes$ID[genes$gene_type == 'protein_coding']) %>%
length()
# assess overlap
lapply(deg, function(x) {
lapply(dig, function(y) {
m <- tibble(a = length(intersect(x, y)),
b = length(x) - a,
c = length(y) - a,
d = tot - a - b - c)
m %>%
as.numeric() %>%
matrix(2, 2) %>%
fisher.test() %>%
tidy() %>%
cbind(m)
}) %>% bind_rows(.id = 'di')
}) %>% bind_rows(.id = 'de') %>%
mutate(grp = sprintf('Expression: %s<br>Interaction: %s', de, di),
info = sprintf('Odds ratio: %.3g (%.3g - %.3g)<br>P-value: %.3g<br>Overlap: %d',
estimate, conf.low, conf.high, p.value, a),
clo = estimate - conf.low,
chi = conf.high - estimate) %>%
plot_ly(x = ~grp, y = ~estimate, type = 'scatter', mode = 'markers', color = ~grp,
error_y = ~list(array = chi, arrayminus = clo, color = grp), showlegend = F,
customdata = ~info, hovertemplate = '%{customdata}<extra></extra>') %>%
layout(xaxis = list(title = NA),
yaxis = list(title = 'Odds ratio'),
shapes = list(type = 'line', line = list(color = 'black'), xref = 'paper', yref = 'y',
x0 = 0, x1 = 1, y0 = 1, y1 = 1, layer = 'below'))